{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ef046c63",
   "metadata": {},
   "source": [
    "# 环境准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "41392daa",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-29T17:29:21.375633Z",
     "start_time": "2024-10-29T17:29:20.104389Z"
    }
   },
   "outputs": [],
   "source": [
    "import toad\n",
    "import warnings\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "pd.set_option('display.width', 10000)\n",
    "pd.set_option('display.max_rows', None)\n",
    "pd.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27cba178",
   "metadata": {},
   "source": [
    "# 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cb620578",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-29T17:29:38.863013Z",
     "start_time": "2024-10-29T17:29:38.694669Z"
    }
   },
   "outputs": [],
   "source": [
    "# Locations of the two CSV splits, relative to this notebook.\n",
    "train_path = '../001.Dataset/train.csv'\n",
    "test_path = '../001.Dataset/test.csv'\n",
    "\n",
    "# Read each split into its own DataFrame.\n",
    "train = pd.read_csv(train_path)\n",
    "test = pd.read_csv(test_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e11f9439",
   "metadata": {},
   "source": [
    "# 特征分箱"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "20539c30",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-29T17:36:38.915484Z",
     "start_time": "2024-10-29T17:36:38.860695Z"
    }
   },
   "outputs": [],
   "source": [
    "# Right-closed cut points for each numeric feature, keyed by column name.\n",
    "# np.digitize(x, bins, right=True) returns the index i for which bins[i-1]\n",
    "# is below x and x is at most bins[i]; values above the last edge map to\n",
    "# len(bins). Each column is therefore replaced by an integer bin code.\n",
    "BIN_EDGES = {\n",
    "    # customer age\n",
    "    '客户年龄': [28.5, 30.5, 34.5, 36.5, 38.5, 49.5, 57.5],\n",
    "    # duration of the last call\n",
    "    '最后一次通话时间': [62.5, 77.5, 94.5, 113.5, 129.5, 145.5, 162.5, 180.5, 205.5, 249.5, 322.5, 393.5, 524.5, 730.5],\n",
    "    # number of contacts during the current campaign\n",
    "    '本次营销活动联系次数': [1.5, 2.5, 3.5, 4.5, 6.5],\n",
    "    # days since the previous contact\n",
    "    '距离上次联系间隔天数': [998.5],\n",
    "    # number of contacts during the previous campaign\n",
    "    '上次营销活动联系次数': [0.5],\n",
    "    # employment variation rate\n",
    "    # Fix: this was written as `[-2.35 -0.65]` (missing comma), which Python\n",
    "    # evaluates as the single edge [-3.0] and collapses the feature to two bins.\n",
    "    '就业变化率': [-2.35, -0.65],\n",
    "    # consumer price index\n",
    "    '消费者价格指数': [92.868],\n",
    "    # consumer information index (literal translation of the column name)\n",
    "    '消费者信息指数': [-35.44999886],\n",
    "    # euribor 3-month rate\n",
    "    'euribor 3月费率': [1.23949999, 3.16750002],\n",
    "    # number of employees\n",
    "    '员工人数': [5087.65014648, 5183.64990234],\n",
    "}\n",
    "\n",
    "# Apply the same edges to both splits so their bin codes are comparable.\n",
    "# NOTE: the columns are overwritten in place, so re-running this cell without\n",
    "# reloading the data would bin the already-binned codes again.\n",
    "for colname, bins in BIN_EDGES.items():\n",
    "    test[colname] = np.digitize(test[colname], bins=bins, right=True)\n",
    "    train[colname] = np.digitize(train[colname], bins=bins, right=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d859c672",
   "metadata": {},
   "source": [
    "# 计算特征信息量"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "26029aa0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-29T17:39:17.572054Z",
     "start_time": "2024-10-29T17:39:17.179201Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>iv</th>\n",
       "      <th>gini</th>\n",
       "      <th>entropy</th>\n",
       "      <th>unique</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>最后一次通话时间</th>\n",
       "      <td>1.847629</td>\n",
       "      <td>0.172726</td>\n",
       "      <td>0.319426</td>\n",
       "      <td>15.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>员工人数</th>\n",
       "      <td>1.139179</td>\n",
       "      <td>0.167645</td>\n",
       "      <td>0.292240</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>euribor 3月费率</th>\n",
       "      <td>1.094396</td>\n",
       "      <td>0.169296</td>\n",
       "      <td>0.294885</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>距离上次联系间隔天数</th>\n",
       "      <td>0.551306</td>\n",
       "      <td>0.178825</td>\n",
       "      <td>0.321195</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>上次营销活动结果</th>\n",
       "      <td>0.547671</td>\n",
       "      <td>0.179391</td>\n",
       "      <td>0.321645</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>上次联系月份</th>\n",
       "      <td>0.485117</td>\n",
       "      <td>0.184873</td>\n",
       "      <td>0.325622</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>消费者信息指数</th>\n",
       "      <td>0.399300</td>\n",
       "      <td>0.186569</td>\n",
       "      <td>0.329599</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>消费者价格指数</th>\n",
       "      <td>0.326540</td>\n",
       "      <td>0.189160</td>\n",
       "      <td>0.333684</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>上次营销活动联系次数</th>\n",
       "      <td>0.277797</td>\n",
       "      <td>0.192440</td>\n",
       "      <td>0.337028</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>联系方式</th>\n",
       "      <td>0.251663</td>\n",
       "      <td>0.195736</td>\n",
       "      <td>0.340383</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>工作类型</th>\n",
       "      <td>0.188713</td>\n",
       "      <td>0.195261</td>\n",
       "      <td>0.342170</td>\n",
       "      <td>12.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>就业变化率</th>\n",
       "      <td>0.186622</td>\n",
       "      <td>0.193418</td>\n",
       "      <td>0.341429</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>客户年龄</th>\n",
       "      <td>0.170257</td>\n",
       "      <td>0.195667</td>\n",
       "      <td>0.343034</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>是否有过违约</th>\n",
       "      <td>0.127898</td>\n",
       "      <td>0.197953</td>\n",
       "      <td>0.346254</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>本次营销活动联系次数</th>\n",
       "      <td>0.063204</td>\n",
       "      <td>0.198915</td>\n",
       "      <td>0.349143</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>教育水平</th>\n",
       "      <td>0.048576</td>\n",
       "      <td>0.198989</td>\n",
       "      <td>0.349639</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>婚姻状况</th>\n",
       "      <td>0.028215</td>\n",
       "      <td>0.199331</td>\n",
       "      <td>0.350595</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>上次联系周几</th>\n",
       "      <td>0.006493</td>\n",
       "      <td>0.199800</td>\n",
       "      <td>0.351707</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>是否有住房贷款</th>\n",
       "      <td>0.001383</td>\n",
       "      <td>0.199899</td>\n",
       "      <td>0.351960</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>是否有个人贷款</th>\n",
       "      <td>0.000269</td>\n",
       "      <td>0.199921</td>\n",
       "      <td>0.352015</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    iv      gini   entropy  unique\n",
       "最后一次通话时间      1.847629  0.172726  0.319426    15.0\n",
       "员工人数          1.139179  0.167645  0.292240     3.0\n",
       "euribor 3月费率  1.094396  0.169296  0.294885     3.0\n",
       "距离上次联系间隔天数    0.551306  0.178825  0.321195     2.0\n",
       "上次营销活动结果      0.547671  0.179391  0.321645     3.0\n",
       "上次联系月份        0.485117  0.184873  0.325622    10.0\n",
       "消费者信息指数       0.399300  0.186569  0.329599     2.0\n",
       "消费者价格指数       0.326540  0.189160  0.333684     2.0\n",
       "上次营销活动联系次数    0.277797  0.192440  0.337028     2.0\n",
       "联系方式          0.251663  0.195736  0.340383     2.0\n",
       "工作类型          0.188713  0.195261  0.342170    12.0\n",
       "就业变化率         0.186622  0.193418  0.341429     2.0\n",
       "客户年龄          0.170257  0.195667  0.343034     8.0\n",
       "是否有过违约        0.127898  0.197953  0.346254     3.0\n",
       "本次营销活动联系次数    0.063204  0.198915  0.349143     6.0\n",
       "教育水平          0.048576  0.198989  0.349639     8.0\n",
       "婚姻状况          0.028215  0.199331  0.350595     4.0\n",
       "上次联系周几        0.006493  0.199800  0.351707     5.0\n",
       "是否有住房贷款       0.001383  0.199899  0.351960     3.0\n",
       "是否有个人贷款       0.000269  0.199921  0.352015     3.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-feature iv / gini / entropy / unique table against the target column.\n",
    "feature_quality = toad.quality(train, target='是否订阅', iv_only=False)\n",
    "feature_quality"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "128d3623",
   "metadata": {},
   "source": [
    "# 特征稳定性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a6c63a94",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-10-29T17:40:03.136505Z",
     "start_time": "2024-10-29T17:40:03.042711Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>客户年龄</td>\n",
       "      <td>0.0006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>工作类型</td>\n",
       "      <td>0.0027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>婚姻状况</td>\n",
       "      <td>0.0004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>教育水平</td>\n",
       "      <td>0.0014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>是否有过违约</td>\n",
       "      <td>0.0014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>是否有住房贷款</td>\n",
       "      <td>0.0002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>是否有个人贷款</td>\n",
       "      <td>0.0008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>联系方式</td>\n",
       "      <td>0.0004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>上次联系月份</td>\n",
       "      <td>0.0038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>上次联系周几</td>\n",
       "      <td>0.0005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>最后一次通话时间</td>\n",
       "      <td>0.0020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>本次营销活动联系次数</td>\n",
       "      <td>0.0011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>距离上次联系间隔天数</td>\n",
       "      <td>0.0001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>上次营销活动联系次数</td>\n",
       "      <td>0.0005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>上次营销活动结果</td>\n",
       "      <td>0.0006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>就业变化率</td>\n",
       "      <td>0.0000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>消费者价格指数</td>\n",
       "      <td>0.0001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>消费者信息指数</td>\n",
       "      <td>0.0001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>euribor 3月费率</td>\n",
       "      <td>0.0003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>员工人数</td>\n",
       "      <td>0.0005</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>是否订阅</td>\n",
       "      <td>0.0001</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           index       0\n",
       "0           客户年龄  0.0006\n",
       "1           工作类型  0.0027\n",
       "2           婚姻状况  0.0004\n",
       "3           教育水平  0.0014\n",
       "4         是否有过违约  0.0014\n",
       "5        是否有住房贷款  0.0002\n",
       "6        是否有个人贷款  0.0008\n",
       "7           联系方式  0.0004\n",
       "8         上次联系月份  0.0038\n",
       "9         上次联系周几  0.0005\n",
       "10      最后一次通话时间  0.0020\n",
       "11    本次营销活动联系次数  0.0011\n",
       "12    距离上次联系间隔天数  0.0001\n",
       "13    上次营销活动联系次数  0.0005\n",
       "14      上次营销活动结果  0.0006\n",
       "15         就业变化率  0.0000\n",
       "16       消费者价格指数  0.0001\n",
       "17       消费者信息指数  0.0001\n",
       "18  euribor 3月费率  0.0003\n",
       "19          员工人数  0.0005\n",
       "20          是否订阅  0.0001"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# PSI of each column between the train and test frames, rounded to 4 decimals\n",
    "# and returned with the column names as a regular column.\n",
    "psi_by_column = toad.metrics.PSI(train, test)\n",
    "psi_by_column.round(4).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afc44dd2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ML",
   "language": "python",
   "name": "ml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "277.6px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
